- # Source Generated with Decompyle++
- # File: in.pyo (Python 2.5)
-
- import difflib
- from lxml import etree
- from lxml.html import fragment_fromstring
- import cgi
- import re
- __all__ = [
- 'html_annotate',
- 'htmldiff']
-
- try:
- _unicode = unicode
- except NameError:
- _unicode = str
-
-
- try:
- basestring = __builtins__['basestring']
- except (KeyError, NameError):
- basestring = str
-
-
- def default_markup(text, version):
- return '<span title="%s">%s</span>' % (cgi.escape(_unicode(version), 1), text)
-
-
- def html_annotate(doclist, markup = default_markup):
- tokenlist = [ tokenize_annotated(doc, version) for doc, version in doclist ]
- cur_tokens = tokenlist[0]
- for tokens in tokenlist[1:]:
- html_annotate_merge_annotations(cur_tokens, tokens)
- cur_tokens = tokens
-
- cur_tokens = compress_tokens(cur_tokens)
- result = markup_serialize_tokens(cur_tokens, markup)
- return ''.join(result).strip()
-
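- # A minimal usage sketch of html_annotate (illustrative input only; the exact
- # span markup depends on the markup callable, default_markup here):
- #
- #   html_annotate([("Hello", "version 1"), ("Hello world", "version 2")])
- #   # -> roughly: '<span title="version 1">Hello</span> <span title="version 2">world</span>'
- #
- # Each (document, version) pair is tokenized, annotations are carried forward
- # across equal regions, and adjacent tokens sharing an annotation are
- # compressed into a single marked-up span.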
-
- def tokenize_annotated(doc, annotation):
- tokens = tokenize(doc, include_hrefs = False)
- for tok in tokens:
- tok.annotation = annotation
-
- return tokens
-
-
- def html_annotate_merge_annotations(tokens_old, tokens_new):
- s = InsensitiveSequenceMatcher(a = tokens_old, b = tokens_new)
- commands = s.get_opcodes()
- for command, i1, i2, j1, j2 in commands:
- if command == 'equal':
- eq_old = tokens_old[i1:i2]
- eq_new = tokens_new[j1:j2]
- copy_annotations(eq_old, eq_new)
- continue
-
-
-
- def copy_annotations(src, dest):
- for src_tok, dest_tok in zip(src, dest):
- dest_tok.annotation = src_tok.annotation
-
-
-
- def compress_tokens(tokens):
- result = [
- tokens[0]]
- for tok in tokens[1:]:
- if not (result[-1].post_tags) and not (tok.pre_tags) and result[-1].annotation == tok.annotation:
- compress_merge_back(result, tok)
- continue
- result.append(tok)
-
- return result
-
-
- def compress_merge_back(tokens, tok):
- last = tokens[-1]
- if type(last) is not token or type(tok) is not token:
- tokens.append(tok)
- else:
- text = _unicode(last)
- if last.trailing_whitespace:
- text += ' '
-
- text += tok
- merged = token(text, pre_tags = last.pre_tags, post_tags = tok.post_tags, trailing_whitespace = tok.trailing_whitespace)
- merged.annotation = last.annotation
- tokens[-1] = merged
-
-
- def markup_serialize_tokens(tokens, markup_func):
- for token in tokens:
- for pre in token.pre_tags:
- yield pre
-
- html = token.html()
- html = markup_func(html, token.annotation)
- if token.trailing_whitespace:
- html += ' '
-
- yield html
- for post in token.post_tags:
- yield post
-
-
-
-
- def htmldiff(old_html, new_html):
- old_html_tokens = tokenize(old_html)
- new_html_tokens = tokenize(new_html)
- result = htmldiff_tokens(old_html_tokens, new_html_tokens)
- result = ''.join(result).strip()
- return fixup_ins_del_tags(result)
-
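- # A hedged usage sketch of htmldiff (hypothetical documents; the exact
- # placement of the markers depends on the tokenization below):
- #
- #   old = '<p>Here is some text.</p>'
- #   new = '<p>Here is some new text.</p>'
- #   htmldiff(old, new)
- #   # -> the new document with added words wrapped in <ins>...</ins>;
- #   #    deleted words would be wrapped in <del>...</del>.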
-
- def htmldiff_tokens(html1_tokens, html2_tokens):
- s = InsensitiveSequenceMatcher(a = html1_tokens, b = html2_tokens)
- commands = s.get_opcodes()
- result = []
- for command, i1, i2, j1, j2 in commands:
- if command == 'equal':
- result.extend(expand_tokens(html2_tokens[j1:j2], equal = True))
- continue
-
- if command == 'insert' or command == 'replace':
- ins_tokens = expand_tokens(html2_tokens[j1:j2])
- merge_insert(ins_tokens, result)
-
- if command == 'delete' or command == 'replace':
- del_tokens = expand_tokens(html1_tokens[i1:i2])
- merge_delete(del_tokens, result)
- continue
-
- result = cleanup_delete(result)
- return result
-
-
- def expand_tokens(tokens, equal = False):
- for token in tokens:
- for pre in token.pre_tags:
- yield pre
-
- if not equal or not (token.hide_when_equal):
- if token.trailing_whitespace:
- yield token.html() + ' '
- else:
- yield token.html()
-
- for post in token.post_tags:
- yield post
-
-
-
-
- def merge_insert(ins_chunks, doc):
- (unbalanced_start, balanced, unbalanced_end) = split_unbalanced(ins_chunks)
- doc.extend(unbalanced_start)
- if doc and not doc[-1].endswith(' '):
- doc[-1] += ' '
-
- doc.append('<ins>')
- if balanced and balanced[-1].endswith(' '):
- balanced[-1] = balanced[-1][:-1]
-
- doc.extend(balanced)
- doc.append('</ins> ')
- doc.extend(unbalanced_end)
-
-
- class DEL_START:
- pass
-
-
- class DEL_END:
- pass
-
-
- class NoDeletes(Exception):
- pass
-
-
- def merge_delete(del_chunks, doc):
- doc.append(DEL_START)
- doc.extend(del_chunks)
- doc.append(DEL_END)
-
-
- def cleanup_delete(chunks):
-     while True:
-
- try:
- (pre_delete, delete, post_delete) = split_delete(chunks)
- except NoDeletes:
- break
-
- (unbalanced_start, balanced, unbalanced_end) = split_unbalanced(delete)
-         locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
-         locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
- doc = pre_delete
- if doc and not doc[-1].endswith(' '):
- doc[-1] += ' '
-
- doc.append('<del>')
- if balanced and balanced[-1].endswith(' '):
- balanced[-1] = balanced[-1][:-1]
-
- doc.extend(balanced)
- doc.append('</del> ')
- doc.extend(post_delete)
- chunks = doc
- continue
- return chunks
-
-
- def split_unbalanced(chunks):
- start = []
- end = []
- tag_stack = []
- balanced = []
- for chunk in chunks:
- if not chunk.startswith('<'):
- balanced.append(chunk)
- continue
-
- endtag = chunk[1] == '/'
- name = chunk.split()[0].strip('<>/')
- if name in empty_tags:
- balanced.append(chunk)
- continue
-
- if endtag:
- if tag_stack and tag_stack[-1][0] == name:
- balanced.append(chunk)
- (name, pos, tag) = tag_stack.pop()
- balanced[pos] = tag
-             elif tag_stack:
-                 start.extend([ tag for name, pos, tag in tag_stack ])
-                 tag_stack = []
-                 end.append(chunk)
-             else:
-                 end.append(chunk)
-         else:
-             tag_stack.append((name, len(balanced), chunk))
-             balanced.append(None)
-
-     start.extend([ chunk for name, pos, chunk in tag_stack ])
-     balanced = [ chunk for chunk in balanced if chunk is not None ]
- return (start, balanced, end)
-
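- # Illustration of split_unbalanced on a hand-made chunk list (chunk lists
- # normally come from expand_tokens):
- #
- #   split_unbalanced(['<div>', 'word ', '</p>'])
- #   # -> (['<div>'], ['word '], ['</p>'])
- #
- # The unmatched opening <div> is returned as unbalanced_start, the unmatched
- # closing </p> as unbalanced_end, and only the text stays in balanced.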
-
- def split_delete(chunks):
-
- try:
- pos = chunks.index(DEL_START)
- except ValueError:
- raise NoDeletes
-
- pos2 = chunks.index(DEL_END)
- return (chunks[:pos], chunks[pos + 1:pos2], chunks[pos2 + 1:])
-
-
- def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
-     while True:
-         if not unbalanced_start:
-             break
-         finding = unbalanced_start[0]
-         finding_name = finding.split()[0].strip('<>')
-         if not post_delete:
-             break
-         next = post_delete[0]
-         if next is DEL_START or not next.startswith('<'):
-             break
-         if next[1] == '/':
-             break
-         name = next.split()[0].strip('<>')
-         if name == 'ins':
-             break
-         if name == finding_name:
-             unbalanced_start.pop(0)
-             pre_delete.append(post_delete.pop(0))
-         else:
-             break
-
-
- def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
-     while True:
-         if not unbalanced_end:
-             break
-         finding = unbalanced_end[-1]
-         finding_name = finding.split()[0].strip('<>/')
-         if not pre_delete:
-             break
-         next = pre_delete[-1]
-         if next is DEL_END or not next.startswith('</'):
-             break
-         name = next.split()[0].strip('<>/')
-         if name == 'ins' or name == 'del':
-             break
-         if name == finding_name:
-             unbalanced_end.pop()
-             post_delete.insert(0, pre_delete.pop())
-         else:
-             break
-
-
- class token(_unicode):
- hide_when_equal = False
-
- def __new__(cls, text, pre_tags = None, post_tags = None, trailing_whitespace = False):
- obj = _unicode.__new__(cls, text)
- if pre_tags is not None:
- obj.pre_tags = pre_tags
- else:
- obj.pre_tags = []
- if post_tags is not None:
- obj.post_tags = post_tags
- else:
- obj.post_tags = []
- obj.trailing_whitespace = trailing_whitespace
- return obj
-
-
- def __repr__(self):
- return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
-
-
- def html(self):
- return _unicode(self)
-
-
-
- class tag_token(token):
-
- def __new__(cls, tag, data, html_repr, pre_tags = None, post_tags = None, trailing_whitespace = False):
-         obj = token.__new__(cls, '%s: %s' % (tag, data), pre_tags = pre_tags, post_tags = post_tags, trailing_whitespace = trailing_whitespace)
- obj.tag = tag
- obj.data = data
- obj.html_repr = html_repr
- return obj
-
-
- def __repr__(self):
- return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (self.tag, self.data, self.html_repr, self.pre_tags, self.post_tags, self.trailing_whitespace)
-
-
- def html(self):
- return self.html_repr
-
-
-
- class href_token(token):
- hide_when_equal = True
-
- def html(self):
- return 'Link: %s' % self
-
-
-
- def tokenize(html, include_hrefs = True):
- body_el = parse_html(html, cleanup = True)
- chunks = flatten_el(body_el, skip_tag = True, include_hrefs = include_hrefs)
- return fixup_chunks(chunks)
-
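- # Rough shape of tokenize() output for '<p>Hi <b>there</b></p>' (hypothetical
- # input): a list of token objects whose text is 'Hi' and 'there', with the
- # surrounding <p>/<b> markup attached as pre_tags and post_tags rather than
- # appearing as tokens of their own.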
-
- def parse_html(html, cleanup = True):
- if cleanup:
- html = cleanup_html(html)
-
- return fragment_fromstring(html, create_parent = True)
-
- _body_re = re.compile('<body.*?>', re.I | re.S)
- _end_body_re = re.compile('</body.*?>', re.I | re.S)
- _ins_del_re = re.compile('</?(ins|del).*?>', re.I | re.S)
-
- def cleanup_html(html):
- match = _body_re.search(html)
- if match:
- html = html[match.end():]
-
- match = _end_body_re.search(html)
- if match:
- html = html[:match.start()]
-
- html = _ins_del_re.sub('', html)
- return html
-
- end_whitespace_re = re.compile('[ \\t\\n\\r]$')
-
- def fixup_chunks(chunks):
- tag_accum = []
- cur_word = None
- result = []
- for chunk in chunks:
- if isinstance(chunk, tuple):
- if chunk[0] == 'img':
- src = chunk[1]
- tag = chunk[2]
- if tag.endswith(' '):
- tag = tag[:-1]
- trailing_whitespace = True
- else:
- trailing_whitespace = False
- cur_word = tag_token('img', src, html_repr = tag, pre_tags = tag_accum, trailing_whitespace = trailing_whitespace)
- tag_accum = []
- result.append(cur_word)
- continue
- if chunk[0] == 'href':
- href = chunk[1]
- cur_word = href_token(href, pre_tags = tag_accum, trailing_whitespace = True)
- tag_accum = []
- result.append(cur_word)
- continue
- continue
-
- if is_word(chunk):
- if chunk.endswith(' '):
- chunk = chunk[:-1]
- trailing_whitespace = True
- else:
- trailing_whitespace = False
- cur_word = token(chunk, pre_tags = tag_accum, trailing_whitespace = trailing_whitespace)
- tag_accum = []
- result.append(cur_word)
- continue
- if is_start_tag(chunk):
- tag_accum.append(chunk)
- continue
- if is_end_tag(chunk):
- if tag_accum:
- tag_accum.append(chunk)
- else:
- cur_word.post_tags.append(chunk)
-
- if not result:
- return [
- token('', pre_tags = tag_accum)]
- else:
- result[-1].post_tags.extend(tag_accum)
- return result
-
- empty_tags = ('param', 'img', 'area', 'br', 'basefont', 'input', 'base', 'meta', 'link', 'col')
- block_level_tags = ('address', 'blockquote', 'center', 'dir', 'div', 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'isindex', 'menu', 'noframes', 'noscript', 'ol', 'p', 'pre', 'table', 'ul')
- block_level_container_tags = ('dd', 'dt', 'frameset', 'li', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr')
-
- def flatten_el(el, include_hrefs, skip_tag = False):
- if not skip_tag:
- if el.tag == 'img':
- yield ('img', el.attrib['src'], start_tag(el))
- else:
- yield start_tag(el)
-
- if el.tag in empty_tags and not (el.text) and not len(el) and not (el.tail):
- return None
-
- start_words = split_words(el.text)
- for word in start_words:
- yield cgi.escape(word)
-
- for child in el:
- for item in flatten_el(child, include_hrefs = include_hrefs):
- yield item
-
-
- if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
- yield ('href', el.attrib['href'])
-
- if not skip_tag:
- yield end_tag(el)
- end_words = split_words(el.tail)
- for word in end_words:
- yield cgi.escape(word)
-
-
-
-
- def split_words(text):
- if not text or not text.strip():
- return []
-
- words = [ w + ' ' for w in text.strip().split() ]
- return words
-
- start_whitespace_re = re.compile('^[ \\t\\n\\r]')
-
- def start_tag(el):
-     return '<%s%s>' % (el.tag, ''.join([ ' %s="%s"' % (name, cgi.escape(value, True)) for name, value in el.attrib.items() ]))
-
-
- def end_tag(el):
- if el.tail and start_whitespace_re.search(el.tail):
- extra = ' '
- else:
- extra = ''
- return '</%s>%s' % (el.tag, extra)
-
-
- def is_word(tok):
- return not tok.startswith('<')
-
-
- def is_end_tag(tok):
- return tok.startswith('</')
-
-
- def is_start_tag(tok):
-     return tok.startswith('<') and not tok.startswith('</')
-
-
- def fixup_ins_del_tags(html):
- doc = parse_html(html, cleanup = False)
- _fixup_ins_del_tags(doc)
- html = serialize_html_fragment(doc, skip_outer = True)
- return html
-
-
- def serialize_html_fragment(el, skip_outer = False):
- html = etree.tostring(el, method = 'html', encoding = _unicode)
- if skip_outer:
- html = html[html.find('>') + 1:]
- html = html[:html.rfind('<')]
- return html.strip()
- else:
- return html
-
-
- def _fixup_ins_del_tags(doc):
- for tag in [
- 'ins',
- 'del']:
- for el in doc.xpath('descendant-or-self::%s' % tag):
- if not _contains_block_level_tag(el):
- continue
-
- _move_el_inside_block(el, tag = tag)
- el.drop_tag()
-
-
-
-
- def _contains_block_level_tag(el):
- if el.tag in block_level_tags or el.tag in block_level_container_tags:
- return True
-
- for child in el:
- if _contains_block_level_tag(child):
- return True
- continue
-
- return False
-
-
- def _move_el_inside_block(el, tag):
- for child in el:
- if _contains_block_level_tag(child):
- break
- continue
-     else:
-         # No block-level tags in any child: wrap the element's content in one new element
-         children_tag = etree.Element(tag)
-         children_tag.text = el.text
-         el.text = None
-         children_tag.extend(list(el))
-         el[:] = [
-             children_tag]
-         return
- for child in list(el):
- if _contains_block_level_tag(child):
- _move_el_inside_block(child, tag)
- if child.tail:
- tail_tag = etree.Element(tag)
- tail_tag.text = child.tail
- child.tail = None
- el.insert(el.index(child) + 1, tail_tag)
-
-         else:
-             child_tag = etree.Element(tag)
-             el.replace(child, child_tag)
-             child_tag.append(child)
-
- if el.text:
- text_tag = etree.Element(tag)
- text_tag.text = el.text
- el.text = None
- el.insert(0, text_tag)
-
-
-
- def _merge_element_contents(el):
- parent = el.getparent()
-     text = el.text or ''
- if el.tail:
- if not len(el):
- text += el.tail
- elif el[-1].tail:
- el[-1].tail += el.tail
- else:
- el[-1].tail = el.tail
-
- index = parent.index(el)
- if text:
- if index == 0:
- previous = None
- else:
- previous = parent[index - 1]
- if previous is None:
- if parent.text:
- parent.text += text
- else:
- parent.text = text
- elif previous.tail:
- previous.tail += text
- else:
- previous.tail = text
-
- parent[index:index + 1] = el.getchildren()
-
-
- class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
- threshold = 2
-
- def get_matching_blocks(self):
-         size = min(len(self.a), len(self.b))
- threshold = min(self.threshold, size / 4)
- actual = difflib.SequenceMatcher.get_matching_blocks(self)
-         return [ item for item in actual if item[2] > threshold or not item[2] ]
-
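- # Sketch of the intended effect (hypothetical strings, split into word tokens):
- #
- #   a = 'the quick brown fox'.split()
- #   b = 'a lazy dog crossed the road'.split()
- #   InsensitiveSequenceMatcher(a=a, b=b).get_matching_blocks()
- #   # -> only the terminating zero-length block; the lone shared word 'the'
- #   #    is below the threshold and is not reported as an equal block.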
-
- if __name__ == '__main__':
- from lxml.html import _diffcommand
- _diffcommand.main()
-
-